# for data manipulation and analysis
import numpy as np
import pandas as pd
import pyspark
from pyspark.sql import SparkSession
# for visualisation
import seaborn as sns
import plotly.express as px
import plotly.subplots as sp
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
# Load the raw Spotify/YouTube track export into a pandas DataFrame.
# NOTE(review): the path is machine-specific; adjust it when running elsewhere.
csv_path = '/Users/suryanshjamwal/Desktop/Analytics Case studies/Spotify/Spotify_Youtube.csv'
df = pd.read_csv(csv_path)
df
| Unnamed: 0 | Artist | Url_spotify | Track | Album | Album_type | Uri | Danceability | Energy | Key | ... | Url_youtube | Title | Channel | Views | Likes | Comments | Description | Licensed | official_video | Stream | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | Gorillaz | https://open.spotify.com/artist/3AA28KZvwAUcZu... | Feel Good Inc. | Demon Days | album | spotify:track:0d28khcov6AiegSCpG5TuT | 0.818 | 0.705 | 6.0 | ... | https://www.youtube.com/watch?v=HyHNuVaZJ-k | Gorillaz - Feel Good Inc. (Official Video) | Gorillaz | 693555221.0 | 6220896.0 | 169907.0 | Official HD Video for Gorillaz' fantastic trac... | True | True | 1.040235e+09 |
| 1 | 1 | Gorillaz | https://open.spotify.com/artist/3AA28KZvwAUcZu... | Rhinestone Eyes | Plastic Beach | album | spotify:track:1foMv2HQwfQ2vntFf9HFeG | 0.676 | 0.703 | 8.0 | ... | https://www.youtube.com/watch?v=yYDmaexVHic | Gorillaz - Rhinestone Eyes [Storyboard Film] (... | Gorillaz | 72011645.0 | 1079128.0 | 31003.0 | The official video for Gorillaz - Rhinestone E... | True | True | 3.100837e+08 |
| 2 | 2 | Gorillaz | https://open.spotify.com/artist/3AA28KZvwAUcZu... | New Gold (feat. Tame Impala and Bootie Brown) | New Gold (feat. Tame Impala and Bootie Brown) | single | spotify:track:64dLd6rVqDLtkXFYrEUHIU | 0.695 | 0.923 | 1.0 | ... | https://www.youtube.com/watch?v=qJa-VFwPpYA | Gorillaz - New Gold ft. Tame Impala & Bootie B... | Gorillaz | 8435055.0 | 282142.0 | 7399.0 | Gorillaz - New Gold ft. Tame Impala & Bootie B... | True | True | 6.306347e+07 |
| 3 | 3 | Gorillaz | https://open.spotify.com/artist/3AA28KZvwAUcZu... | On Melancholy Hill | Plastic Beach | album | spotify:track:0q6LuUqGLUiCPP1cbdwFs3 | 0.689 | 0.739 | 2.0 | ... | https://www.youtube.com/watch?v=04mfKJWDSzI | Gorillaz - On Melancholy Hill (Official Video) | Gorillaz | 211754952.0 | 1788577.0 | 55229.0 | Follow Gorillaz online:\nhttp://gorillaz.com \... | True | True | 4.346636e+08 |
| 4 | 4 | Gorillaz | https://open.spotify.com/artist/3AA28KZvwAUcZu... | Clint Eastwood | Gorillaz | album | spotify:track:7yMiX7n9SBvadzox8T5jzT | 0.663 | 0.694 | 10.0 | ... | https://www.youtube.com/watch?v=1V_xRb0x9aw | Gorillaz - Clint Eastwood (Official Video) | Gorillaz | 618480958.0 | 6197318.0 | 155930.0 | The official music video for Gorillaz - Clint ... | True | True | 6.172597e+08 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20713 | 20713 | SICK LEGEND | https://open.spotify.com/artist/3EYY5FwDkHEYLw... | JUST DANCE HARDSTYLE | JUST DANCE HARDSTYLE | single | spotify:track:0RtcKQGyI4hr8FgFH1TuYG | 0.582 | 0.926 | 5.0 | ... | https://www.youtube.com/watch?v=5SHmKFKlNqI | JUST DANCE HARDSTYLE | SICK LEGEND - Topic | 71678.0 | 1113.0 | 0.0 | Provided to YouTube by Routenote\n\nJUST DANCE... | True | True | 9.227144e+06 |
| 20714 | 20714 | SICK LEGEND | https://open.spotify.com/artist/3EYY5FwDkHEYLw... | SET FIRE TO THE RAIN HARDSTYLE | SET FIRE TO THE RAIN HARDSTYLE | single | spotify:track:3rHvPA8lUnPBkaLyPOc0VV | 0.531 | 0.936 | 4.0 | ... | https://www.youtube.com/watch?v=ocTH6KxllDQ | SET FIRE TO THE RAIN HARDSTYLE | SICK LEGEND - Topic | 164741.0 | 2019.0 | 0.0 | Provided to YouTube by Routenote\n\nSET FIRE T... | True | True | 1.089818e+07 |
| 20715 | 20715 | SICK LEGEND | https://open.spotify.com/artist/3EYY5FwDkHEYLw... | OUTSIDE HARDSTYLE SPED UP | OUTSIDE HARDSTYLE SPED UP | single | spotify:track:4jk00YxPtPbhvHJE9N4ddv | 0.443 | 0.830 | 4.0 | ... | https://www.youtube.com/watch?v=5wFhE-HY0hg | OUTSIDE HARDSTYLE SPED UP | SICK LEGEND - Topic | 35646.0 | 329.0 | 0.0 | Provided to YouTube by Routenote\n\nOUTSIDE HA... | True | True | 6.226110e+06 |
| 20716 | 20716 | SICK LEGEND | https://open.spotify.com/artist/3EYY5FwDkHEYLw... | ONLY GIRL HARDSTYLE | ONLY GIRL HARDSTYLE | single | spotify:track:5EyErbpsugWliX006eTDex | 0.417 | 0.767 | 9.0 | ... | https://www.youtube.com/watch?v=VMFLbFRNCn0 | ONLY GIRL HARDSTYLE | SICK LEGEND - Topic | 6533.0 | 88.0 | 0.0 | Provided to YouTube by Routenote\n\nONLY GIRL ... | True | True | 6.873961e+06 |
| 20717 | 20717 | SICK LEGEND | https://open.spotify.com/artist/3EYY5FwDkHEYLw... | MISS YOU HARDSTYLE | MISS YOU HARDSTYLE | single | spotify:track:6lOn0jz1QpjcWeXo1oMm0k | 0.498 | 0.938 | 6.0 | ... | https://www.youtube.com/watch?v=zau0dckCFi0 | MISS YOU HARDSTYLE | SICK LEGEND - Topic | 158697.0 | 2484.0 | 0.0 | Provided to YouTube by Routenote\n\nMISS YOU H... | True | True | 5.695584e+06 |
20718 rows × 28 columns
# Preview the first five rows to sanity-check the load.
df.head()
| Unnamed: 0 | Artist | Url_spotify | Track | Album | Album_type | Uri | Danceability | Energy | Key | ... | Url_youtube | Title | Channel | Views | Likes | Comments | Description | Licensed | official_video | Stream | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | Gorillaz | https://open.spotify.com/artist/3AA28KZvwAUcZu... | Feel Good Inc. | Demon Days | album | spotify:track:0d28khcov6AiegSCpG5TuT | 0.818 | 0.705 | 6.0 | ... | https://www.youtube.com/watch?v=HyHNuVaZJ-k | Gorillaz - Feel Good Inc. (Official Video) | Gorillaz | 693555221.0 | 6220896.0 | 169907.0 | Official HD Video for Gorillaz' fantastic trac... | True | True | 1.040235e+09 |
| 1 | 1 | Gorillaz | https://open.spotify.com/artist/3AA28KZvwAUcZu... | Rhinestone Eyes | Plastic Beach | album | spotify:track:1foMv2HQwfQ2vntFf9HFeG | 0.676 | 0.703 | 8.0 | ... | https://www.youtube.com/watch?v=yYDmaexVHic | Gorillaz - Rhinestone Eyes [Storyboard Film] (... | Gorillaz | 72011645.0 | 1079128.0 | 31003.0 | The official video for Gorillaz - Rhinestone E... | True | True | 3.100837e+08 |
| 2 | 2 | Gorillaz | https://open.spotify.com/artist/3AA28KZvwAUcZu... | New Gold (feat. Tame Impala and Bootie Brown) | New Gold (feat. Tame Impala and Bootie Brown) | single | spotify:track:64dLd6rVqDLtkXFYrEUHIU | 0.695 | 0.923 | 1.0 | ... | https://www.youtube.com/watch?v=qJa-VFwPpYA | Gorillaz - New Gold ft. Tame Impala & Bootie B... | Gorillaz | 8435055.0 | 282142.0 | 7399.0 | Gorillaz - New Gold ft. Tame Impala & Bootie B... | True | True | 6.306347e+07 |
| 3 | 3 | Gorillaz | https://open.spotify.com/artist/3AA28KZvwAUcZu... | On Melancholy Hill | Plastic Beach | album | spotify:track:0q6LuUqGLUiCPP1cbdwFs3 | 0.689 | 0.739 | 2.0 | ... | https://www.youtube.com/watch?v=04mfKJWDSzI | Gorillaz - On Melancholy Hill (Official Video) | Gorillaz | 211754952.0 | 1788577.0 | 55229.0 | Follow Gorillaz online:\nhttp://gorillaz.com \... | True | True | 4.346636e+08 |
| 4 | 4 | Gorillaz | https://open.spotify.com/artist/3AA28KZvwAUcZu... | Clint Eastwood | Gorillaz | album | spotify:track:7yMiX7n9SBvadzox8T5jzT | 0.663 | 0.694 | 10.0 | ... | https://www.youtube.com/watch?v=1V_xRb0x9aw | Gorillaz - Clint Eastwood (Official Video) | Gorillaz | 618480958.0 | 6197318.0 | 155930.0 | The official music video for Gorillaz - Clint ... | True | True | 6.172597e+08 |
5 rows × 28 columns
# Report each column's dtype and non-null count to spot missing values.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 20718 entries, 0 to 20717 Data columns (total 28 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 20718 non-null int64 1 Artist 20718 non-null object 2 Url_spotify 20718 non-null object 3 Track 20718 non-null object 4 Album 20718 non-null object 5 Album_type 20718 non-null object 6 Uri 20718 non-null object 7 Danceability 20716 non-null float64 8 Energy 20716 non-null float64 9 Key 20716 non-null float64 10 Loudness 20716 non-null float64 11 Speechiness 20716 non-null float64 12 Acousticness 20716 non-null float64 13 Instrumentalness 20716 non-null float64 14 Liveness 20716 non-null float64 15 Valence 20716 non-null float64 16 Tempo 20716 non-null float64 17 Duration_ms 20716 non-null float64 18 Url_youtube 20248 non-null object 19 Title 20248 non-null object 20 Channel 20248 non-null object 21 Views 20248 non-null float64 22 Likes 20177 non-null float64 23 Comments 20149 non-null float64 24 Description 19842 non-null object 25 Licensed 20248 non-null object 26 official_video 20248 non-null object 27 Stream 20142 non-null float64 dtypes: float64(15), int64(1), object(12) memory usage: 4.4+ MB
# Remove the index, URL/URI, free-text and channel columns before analysis,
# then list the columns that remain.
unused_columns = [
    'Unnamed: 0', 'Title', 'Url_spotify', 'Uri',
    'Description', 'official_video', 'Channel', 'Url_youtube',
]
df = df.drop(columns=unused_columns)
df.columns
Index(['Artist', 'Track', 'Album', 'Album_type', 'Danceability', 'Energy',
'Key', 'Loudness', 'Speechiness', 'Acousticness', 'Instrumentalness',
'Liveness', 'Valence', 'Tempo', 'Duration_ms', 'Views', 'Likes',
'Comments', 'Licensed', 'Stream'],
dtype='object')
# Keep only rows with a value in every remaining column, then re-check
# the non-null counts.
df = df.dropna(axis=0, how='any')
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 19549 entries, 0 to 20717 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Artist 19549 non-null object 1 Track 19549 non-null object 2 Album 19549 non-null object 3 Album_type 19549 non-null object 4 Danceability 19549 non-null float64 5 Energy 19549 non-null float64 6 Key 19549 non-null float64 7 Loudness 19549 non-null float64 8 Speechiness 19549 non-null float64 9 Acousticness 19549 non-null float64 10 Instrumentalness 19549 non-null float64 11 Liveness 19549 non-null float64 12 Valence 19549 non-null float64 13 Tempo 19549 non-null float64 14 Duration_ms 19549 non-null float64 15 Views 19549 non-null float64 16 Likes 19549 non-null float64 17 Comments 19549 non-null float64 18 Licensed 19549 non-null object 19 Stream 19549 non-null float64 dtypes: float64(15), object(5) memory usage: 3.1+ MB
# Summary statistics of the numeric columns after removing rows with missing values.
df.describe()
| Danceability | Energy | Key | Loudness | Speechiness | Acousticness | Instrumentalness | Liveness | Valence | Tempo | Duration_ms | Views | Likes | Comments | Stream | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 19549.000000 | 19549.000000 | 19549.000000 | 19549.000000 | 19549.000000 | 19549.000000 | 19549.000000 | 19549.000000 | 19549.000000 | 19549.000000 | 1.954900e+04 | 1.954900e+04 | 1.954900e+04 | 1.954900e+04 | 1.954900e+04 |
| mean | 0.621059 | 0.635170 | 5.294337 | -7.633179 | 0.095392 | 0.289106 | 0.055292 | 0.191226 | 0.528950 | 120.605702 | 2.246281e+05 | 9.545626e+07 | 6.700487e+05 | 2.786371e+04 | 1.371101e+08 |
| std | 0.165489 | 0.213555 | 3.579338 | 4.618839 | 0.106243 | 0.285908 | 0.192519 | 0.165197 | 0.245228 | 29.619340 | 1.269126e+05 | 2.775744e+08 | 1.805054e+06 | 1.959074e+05 | 2.463589e+08 |
| min | 0.000000 | 0.000020 | 0.000000 | -46.251000 | 0.000000 | 0.000001 | 0.000000 | 0.014500 | 0.000000 | 0.000000 | 3.098500e+04 | 2.600000e+01 | 0.000000e+00 | 0.000000e+00 | 6.574000e+03 |
| 25% | 0.519000 | 0.508000 | 2.000000 | -8.772000 | 0.035700 | 0.044400 | 0.000000 | 0.094000 | 0.339000 | 96.990000 | 1.802400e+05 | 1.911528e+06 | 2.238000e+04 | 5.310000e+02 | 1.781089e+07 |
| 50% | 0.639000 | 0.666000 | 5.000000 | -6.516000 | 0.050700 | 0.190000 | 0.000002 | 0.125000 | 0.536000 | 119.964000 | 2.132530e+05 | 1.491440e+07 | 1.279090e+05 | 3.343000e+03 | 4.979139e+07 |
| 75% | 0.742000 | 0.797000 | 8.000000 | -4.929000 | 0.104000 | 0.470000 | 0.000433 | 0.234000 | 0.725000 | 139.951000 | 2.519200e+05 | 7.152989e+07 | 5.266400e+05 | 1.449300e+04 | 1.390828e+08 |
| max | 0.975000 | 1.000000 | 11.000000 | 0.920000 | 0.964000 | 0.996000 | 1.000000 | 1.000000 | 0.993000 | 243.372000 | 4.676058e+06 | 8.079649e+09 | 5.078865e+07 | 1.608314e+07 | 3.386520e+09 |
# Keep the first row for each track title and discard later repeats.
# NOTE(review): de-duplication is on 'Track' alone, so different songs that
# share a title are collapsed too - confirm this is intended.
df = df.drop_duplicates(subset=['Track'], keep='first')
df
| Artist | Track | Album | Album_type | Danceability | Energy | Key | Loudness | Speechiness | Acousticness | Instrumentalness | Liveness | Valence | Tempo | Duration_ms | Views | Likes | Comments | Licensed | Stream | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Gorillaz | Feel Good Inc. | Demon Days | album | 0.818 | 0.705 | 6.0 | -6.679 | 0.1770 | 0.008360 | 0.002330 | 0.6130 | 0.7720 | 138.559 | 222640.0 | 693555221.0 | 6220896.0 | 169907.0 | True | 1.040235e+09 |
| 1 | Gorillaz | Rhinestone Eyes | Plastic Beach | album | 0.676 | 0.703 | 8.0 | -5.815 | 0.0302 | 0.086900 | 0.000687 | 0.0463 | 0.8520 | 92.761 | 200173.0 | 72011645.0 | 1079128.0 | 31003.0 | True | 3.100837e+08 |
| 2 | Gorillaz | New Gold (feat. Tame Impala and Bootie Brown) | New Gold (feat. Tame Impala and Bootie Brown) | single | 0.695 | 0.923 | 1.0 | -3.930 | 0.0522 | 0.042500 | 0.046900 | 0.1160 | 0.5510 | 108.014 | 215150.0 | 8435055.0 | 282142.0 | 7399.0 | True | 6.306347e+07 |
| 3 | Gorillaz | On Melancholy Hill | Plastic Beach | album | 0.689 | 0.739 | 2.0 | -5.810 | 0.0260 | 0.000015 | 0.509000 | 0.0640 | 0.5780 | 120.423 | 233867.0 | 211754952.0 | 1788577.0 | 55229.0 | True | 4.346636e+08 |
| 4 | Gorillaz | Clint Eastwood | Gorillaz | album | 0.663 | 0.694 | 10.0 | -8.627 | 0.1710 | 0.025300 | 0.000000 | 0.0698 | 0.5250 | 167.953 | 340920.0 | 618480958.0 | 6197318.0 | 155930.0 | True | 6.172597e+08 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20713 | SICK LEGEND | JUST DANCE HARDSTYLE | JUST DANCE HARDSTYLE | single | 0.582 | 0.926 | 5.0 | -6.344 | 0.0328 | 0.448000 | 0.000000 | 0.0839 | 0.6580 | 90.002 | 94667.0 | 71678.0 | 1113.0 | 0.0 | True | 9.227144e+06 |
| 20714 | SICK LEGEND | SET FIRE TO THE RAIN HARDSTYLE | SET FIRE TO THE RAIN HARDSTYLE | single | 0.531 | 0.936 | 4.0 | -1.786 | 0.1370 | 0.028000 | 0.000000 | 0.0923 | 0.6570 | 174.869 | 150857.0 | 164741.0 | 2019.0 | 0.0 | True | 1.089818e+07 |
| 20715 | SICK LEGEND | OUTSIDE HARDSTYLE SPED UP | OUTSIDE HARDSTYLE SPED UP | single | 0.443 | 0.830 | 4.0 | -4.679 | 0.0647 | 0.024300 | 0.000000 | 0.1540 | 0.4190 | 168.388 | 136842.0 | 35646.0 | 329.0 | 0.0 | True | 6.226110e+06 |
| 20716 | SICK LEGEND | ONLY GIRL HARDSTYLE | ONLY GIRL HARDSTYLE | single | 0.417 | 0.767 | 9.0 | -4.004 | 0.4190 | 0.356000 | 0.018400 | 0.1080 | 0.5390 | 155.378 | 108387.0 | 6533.0 | 88.0 | 0.0 | True | 6.873961e+06 |
| 20717 | SICK LEGEND | MISS YOU HARDSTYLE | MISS YOU HARDSTYLE | single | 0.498 | 0.938 | 6.0 | -4.543 | 0.1070 | 0.002770 | 0.911000 | 0.1360 | 0.0787 | 160.067 | 181500.0 | 158697.0 | 2484.0 | 0.0 | True | 5.695584e+06 |
16866 rows × 20 columns
# Restrict to the numeric columns for the statistics and correlation work.
numeric_dtypes = [np.number]
numeric_df = df.select_dtypes(include=numeric_dtypes)
numeric_df
| Danceability | Energy | Key | Loudness | Speechiness | Acousticness | Instrumentalness | Liveness | Valence | Tempo | Duration_ms | Views | Likes | Comments | Stream | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.818 | 0.705 | 6.0 | -6.679 | 0.1770 | 0.008360 | 0.002330 | 0.6130 | 0.7720 | 138.559 | 222640.0 | 693555221.0 | 6220896.0 | 169907.0 | 1.040235e+09 |
| 1 | 0.676 | 0.703 | 8.0 | -5.815 | 0.0302 | 0.086900 | 0.000687 | 0.0463 | 0.8520 | 92.761 | 200173.0 | 72011645.0 | 1079128.0 | 31003.0 | 3.100837e+08 |
| 2 | 0.695 | 0.923 | 1.0 | -3.930 | 0.0522 | 0.042500 | 0.046900 | 0.1160 | 0.5510 | 108.014 | 215150.0 | 8435055.0 | 282142.0 | 7399.0 | 6.306347e+07 |
| 3 | 0.689 | 0.739 | 2.0 | -5.810 | 0.0260 | 0.000015 | 0.509000 | 0.0640 | 0.5780 | 120.423 | 233867.0 | 211754952.0 | 1788577.0 | 55229.0 | 4.346636e+08 |
| 4 | 0.663 | 0.694 | 10.0 | -8.627 | 0.1710 | 0.025300 | 0.000000 | 0.0698 | 0.5250 | 167.953 | 340920.0 | 618480958.0 | 6197318.0 | 155930.0 | 6.172597e+08 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20713 | 0.582 | 0.926 | 5.0 | -6.344 | 0.0328 | 0.448000 | 0.000000 | 0.0839 | 0.6580 | 90.002 | 94667.0 | 71678.0 | 1113.0 | 0.0 | 9.227144e+06 |
| 20714 | 0.531 | 0.936 | 4.0 | -1.786 | 0.1370 | 0.028000 | 0.000000 | 0.0923 | 0.6570 | 174.869 | 150857.0 | 164741.0 | 2019.0 | 0.0 | 1.089818e+07 |
| 20715 | 0.443 | 0.830 | 4.0 | -4.679 | 0.0647 | 0.024300 | 0.000000 | 0.1540 | 0.4190 | 168.388 | 136842.0 | 35646.0 | 329.0 | 0.0 | 6.226110e+06 |
| 20716 | 0.417 | 0.767 | 9.0 | -4.004 | 0.4190 | 0.356000 | 0.018400 | 0.1080 | 0.5390 | 155.378 | 108387.0 | 6533.0 | 88.0 | 0.0 | 6.873961e+06 |
| 20717 | 0.498 | 0.938 | 6.0 | -4.543 | 0.1070 | 0.002770 | 0.911000 | 0.1360 | 0.0787 | 160.067 | 181500.0 | 158697.0 | 2484.0 | 0.0 | 5.695584e+06 |
16866 rows × 15 columns
# Summary statistics of the numeric columns after duplicate tracks were removed.
numeric_df.describe()
| Danceability | Energy | Key | Loudness | Speechiness | Acousticness | Instrumentalness | Liveness | Valence | Tempo | Duration_ms | Views | Likes | Comments | Stream | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 16866.000000 | 16866.000000 | 16866.000000 | 16866.000000 | 16866.000000 | 16866.000000 | 16866.000000 | 16866.000000 | 16866.000000 | 16866.000000 | 1.686600e+04 | 1.686600e+04 | 1.686600e+04 | 1.686600e+04 | 1.686600e+04 |
| mean | 0.619050 | 0.635350 | 5.279912 | -7.677653 | 0.094880 | 0.287854 | 0.056587 | 0.191661 | 0.530527 | 120.676194 | 2.226875e+05 | 8.777338e+07 | 6.102794e+05 | 2.543879e+04 | 1.300308e+08 |
| std | 0.165577 | 0.215155 | 3.574273 | 4.637705 | 0.107337 | 0.287661 | 0.193770 | 0.165636 | 0.246654 | 29.685312 | 9.287362e+04 | 2.562711e+08 | 1.644068e+06 | 1.517489e+05 | 2.330323e+08 |
| min | 0.000000 | 0.000020 | 0.000000 | -46.251000 | 0.000000 | 0.000001 | 0.000000 | 0.014500 | 0.000000 | 0.000000 | 3.098500e+04 | 2.600000e+01 | 0.000000e+00 | 0.000000e+00 | 6.574000e+03 |
| 25% | 0.515000 | 0.505000 | 2.000000 | -8.874750 | 0.035400 | 0.041500 | 0.000000 | 0.094000 | 0.339000 | 96.963750 | 1.808240e+05 | 1.818648e+06 | 2.134500e+04 | 5.100000e+02 | 1.695603e+07 |
| 50% | 0.636000 | 0.666000 | 5.000000 | -6.551500 | 0.050200 | 0.186000 | 0.000003 | 0.124000 | 0.538000 | 119.969000 | 2.134770e+05 | 1.381934e+07 | 1.173160e+05 | 3.137000e+03 | 4.813189e+07 |
| 75% | 0.741000 | 0.800000 | 8.000000 | -4.946000 | 0.103000 | 0.472000 | 0.000517 | 0.236000 | 0.729000 | 139.982000 | 2.522248e+05 | 6.650796e+07 | 4.807730e+05 | 1.351250e+04 | 1.328671e+08 |
| max | 0.975000 | 1.000000 | 11.000000 | 0.920000 | 0.964000 | 0.996000 | 1.000000 | 1.000000 | 0.993000 | 243.372000 | 4.676058e+06 | 8.079647e+09 | 5.078863e+07 | 9.131761e+06 | 3.386520e+09 |
# Draw an annotated heatmap of the pairwise correlations between the numeric
# columns and save it to heatmap.png.
correlations = numeric_df.corr()
plt.figure(figsize=(16, 10))
heatmap = sns.heatmap(
    correlations,
    annot=True,
    cmap='BrBG',
    vmin=-1,
    vmax=1,
)
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize': 19}, pad=12)
plt.savefig('heatmap.png', dpi=550, bbox_inches='tight')
# Start (or reuse) a Spark session and mirror the cleaned pandas DataFrame
# into Spark so it can be queried with spark.sql.
session_builder = SparkSession.builder.appName("pyspark.sql")
spark = session_builder.getOrCreate()
df_spark = spark.createDataFrame(df)
df_spark.show()
Setting default log level to "WARN". To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 23/05/02 18:28:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable [Stage 0:> (0 + 1) / 1]
+--------------------+--------------------+--------------------+----------+------------+------+----+--------+-----------+------------+----------------+--------+-------+-------+-----------+-------------+---------+--------+--------+-------------+ | Artist| Track| Album|Album_type|Danceability|Energy| Key|Loudness|Speechiness|Acousticness|Instrumentalness|Liveness|Valence| Tempo|Duration_ms| Views| Likes|Comments|Licensed| Stream| +--------------------+--------------------+--------------------+----------+------------+------+----+--------+-----------+------------+----------------+--------+-------+-------+-----------+-------------+---------+--------+--------+-------------+ | Gorillaz| Feel Good Inc.| Demon Days| album| 0.818| 0.705| 6.0| -6.679| 0.177| 0.00836| 0.00233| 0.613| 0.772|138.559| 222640.0| 6.93555221E8|6220896.0|169907.0| true|1.040234854E9| | Gorillaz| Rhinestone Eyes| Plastic Beach| album| 0.676| 0.703| 8.0| -5.815| 0.0302| 0.0869| 6.87E-4| 0.0463| 0.852| 92.761| 200173.0| 7.2011645E7|1079128.0| 31003.0| true| 3.10083733E8| | Gorillaz|New Gold (feat. T...|New Gold (feat. T...| single| 0.695| 0.923| 1.0| -3.93| 0.0522| 0.0425| 0.0469| 0.116| 0.551|108.014| 215150.0| 8435055.0| 282142.0| 7399.0| true| 6.3063467E7| | Gorillaz| On Melancholy Hill| Plastic Beach| album| 0.689| 0.739| 2.0| -5.81| 0.026| 1.51E-5| 0.509| 0.064| 0.578|120.423| 233867.0| 2.11754952E8|1788577.0| 55229.0| true| 4.34663559E8| | Gorillaz| Clint Eastwood| Gorillaz| album| 0.663| 0.694|10.0| -8.627| 0.171| 0.0253| 0.0| 0.0698| 0.525|167.953| 340920.0| 6.18480958E8|6197318.0|155930.0| true| 6.17259738E8| | Gorillaz| DARE| Demon Days| album| 0.76| 0.891|11.0| -5.852| 0.0372| 0.0229| 0.0869| 0.298| 0.966|120.264| 245000.0| 2.59021161E8|1844658.0| 72008.0| true| 3.23850327E8| | Gorillaz|New Gold (feat. T...|New Gold (feat. 
T...| single| 0.716| 0.897| 4.0| -7.185| 0.0629| 0.012| 0.262| 0.325| 0.358| 127.03| 274142.0| 451996.0| 11686.0| 241.0| false| 1.0666154E7| | Gorillaz|She's My Collar (...| Humanz (Deluxe)| album| 0.726| 0.815|11.0| -5.886| 0.0313| 0.00799| 0.081| 0.112| 0.462|140.158| 209560.0| 1010982.0| 17675.0| 260.0| false| 1.59605929E8| | Gorillaz|Cracker Island (f...|Cracker Island (f...| single| 0.741| 0.913| 2.0| -3.34| 0.0465| 0.00343| 0.103| 0.325| 0.643|120.012| 213750.0| 2.445982E7| 739527.0| 20296.0| true| 4.2671901E7| | Gorillaz| Dirty Harry| Demon Days| album| 0.625| 0.877|10.0| -7.176| 0.162| 0.0315| 0.0811| 0.672| 0.865|192.296| 230426.0| 1.54761056E8|1386920.0| 39240.0| true| 1.91074713E8| |Red Hot Chili Pep...| Californication|Californication (...| album| 0.592| 0.767| 9.0| -2.788| 0.027| 0.0021| 0.00165| 0.127| 0.328| 96.483| 329733.0|1.018811259E9|4394471.0|121452.0| true|1.055738398E9| |Red Hot Chili Pep...| Under the Bridge|Blood Sugar Sex M...| album| 0.559| 0.345| 4.0| -13.496| 0.0459| 0.0576| 1.05E-4| 0.141| 0.458| 84.581| 264307.0| 2.46687714E8|1213572.0| 32761.0| true|1.061750522E9| |Red Hot Chili Pep...| Can't Stop|By the Way (Delux...| album| 0.618| 0.938| 9.0| -3.442| 0.0456| 0.0179| 0.0| 0.167| 0.875| 91.455| 269000.0| 3.36635759E8|1740224.0| 32573.0| true| 8.66464951E8| |Red Hot Chili Pep...| Scar Tissue|Californication (...| album| 0.595| 0.717| 0.0| -4.803| 0.0295| 0.0779| 0.00274| 0.108| 0.547| 88.969| 215907.0| 4.3512153E8|1890900.0| 37069.0| true| 6.13838674E8| |Red Hot Chili Pep...| Otherside|Californication (...| album| 0.458| 0.795| 0.0| -3.265| 0.0574| 0.00316| 2.02E-4| 0.0756| 0.513|123.229| 255373.0| 6.73528656E8|3140356.0| 60091.0| true| 7.32774515E8| |Red Hot Chili Pep...| Snow (Hey Oh)| Stadium Arcadium| album| 0.427| 0.9|11.0| -3.674| 0.0499| 0.116| 1.75E-5| 0.119| 0.599|104.655| 334667.0| 3.20871237E8|1272266.0| 37004.0| true| 8.60722316E8| |Red Hot Chili Pep...| Dani California| Stadium Arcadium| album| 0.556| 0.913| 0.0| -2.36| 
0.0437| 0.0193| 8.59E-6| 0.346| 0.73| 96.184| 282160.0| 3.24228662E8|1456622.0| 49461.0| true| 5.50067391E8| |Red Hot Chili Pep...| By the Way|By the Way (Delux...| album| 0.451| 0.97| 0.0| -4.938| 0.107| 0.0264| 0.00355| 0.102| 0.198|122.444| 216933.0| 1.79005296E8| 784717.0| 20084.0| true| 3.67485508E8| |Red Hot Chili Pep...| Give It Away|Blood Sugar Sex M...| album| 0.666| 0.936| 7.0| -9.919| 0.0476| 0.00244| 0.086| 0.153| 0.776| 91.577| 282907.0| 8.6637926E7| 434837.0| 16029.0| true| 3.01947159E8| |Red Hot Chili Pep...| Dark Necessities| The Getaway| album| 0.7| 0.742| 5.0| -6.777| 0.0716| 0.0722| 0.0199| 0.11| 0.197| 91.959| 302000.0| 4.40037964E8|2094182.0| 56516.0| true| 3.85677873E8| +--------------------+--------------------+--------------------+----------+------------+------+----+--------+-----------+------------+----------------+--------+-------+-------+-----------+-------------+---------+--------+--------+-------------+ only showing top 20 rows
# Register the Spark DataFrame as the SQL view "table0" and keep the 600
# rows with the highest Stream value as a pandas DataFrame for later use.
df_spark.createOrReplaceTempView("table0")
top_streams_query = '''
SELECT *
FROM table0
ORDER BY Stream DESC
LIMIT 600'''
df600_spot = spark.sql(top_streams_query).toPandas()
df600_spot
| Artist | Track | Album | Album_type | Danceability | Energy | Key | Loudness | Speechiness | Acousticness | Instrumentalness | Liveness | Valence | Tempo | Duration_ms | Views | Likes | Comments | Licensed | Stream | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | The Weeknd | Blinding Lights | After Hours | album | 0.514 | 0.730 | 1.0 | -5.934 | 0.0598 | 0.00146 | 0.000095 | 0.0897 | 0.334 | 171.005 | 200040.0 | 6.741645e+08 | 8817927.0 | 282589.0 | True | 3.386520e+09 |
| 1 | Ed Sheeran | Shape of You | ÷ (Deluxe) | album | 0.825 | 0.652 | 1.0 | -3.183 | 0.0802 | 0.58100 | 0.000000 | 0.0931 | 0.931 | 95.977 | 233713.0 | 5.908398e+09 | 31047780.0 | 1130327.0 | True | 3.362005e+09 |
| 2 | Lewis Capaldi | Someone You Loved | Divinely Uninspired To A Hellish Extent | album | 0.501 | 0.405 | 1.0 | -5.679 | 0.0319 | 0.75100 | 0.000000 | 0.1050 | 0.446 | 109.891 | 182161.0 | 5.867684e+08 | 7367091.0 | 147565.0 | True | 2.634013e+09 |
| 3 | Post Malone | rockstar (feat. 21 Savage) | beerbongs & bentleys | album | 0.585 | 0.520 | 5.0 | -6.136 | 0.0712 | 0.12400 | 0.000070 | 0.1310 | 0.129 | 159.801 | 218147.0 | 1.060220e+09 | 12564657.0 | 366520.0 | True | 2.594927e+09 |
| 4 | Swae Lee | Sunflower - Spider-Man: Into the Spider-Verse | Hollywood's Bleeding | album | 0.755 | 0.522 | 2.0 | -4.368 | 0.0575 | 0.53300 | 0.000000 | 0.0685 | 0.925 | 89.960 | 157560.0 | 1.977389e+09 | 13749806.0 | 331064.0 | True | 2.538330e+09 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 595 | Migos | Walk It Talk It | Culture II | album | 0.907 | 0.633 | 2.0 | -5.145 | 0.1840 | 0.08760 | 0.000003 | 0.1060 | 0.395 | 145.914 | 276147.0 | 4.020519e+08 | 3589336.0 | 111565.0 | True | 6.754389e+08 |
| 596 | Tove Lo | Habits (Stay High) | Queen Of The Clouds | album | 0.729 | 0.650 | 5.0 | -3.539 | 0.0313 | 0.07020 | 0.000067 | 0.0829 | 0.347 | 110.020 | 209160.0 | 1.012196e+09 | 5376201.0 | 107220.0 | True | 6.749586e+08 |
| 597 | A$AP Rocky | F**kin' Problems (feat. Drake, 2 Chainz & Kend... | LONG.LIVE.A$AP (Deluxe Version) | album | 0.853 | 0.693 | 1.0 | -6.870 | 0.2750 | 0.02390 | 0.000000 | 0.1100 | 0.662 | 95.967 | 233787.0 | 2.962508e+08 | 1955320.0 | 61567.0 | True | 6.736265e+08 |
| 598 | Billie Eilish | Bored | Bored | single | 0.614 | 0.318 | 7.0 | -12.695 | 0.0478 | 0.89600 | 0.002390 | 0.0795 | 0.112 | 119.959 | 180933.0 | 5.467207e+07 | 1089600.0 | 18839.0 | True | 6.728659e+08 |
| 599 | Big Sean | I Don't Fuck With You | Dark Sky Paradise | album | 0.824 | 0.733 | 1.0 | -5.474 | 0.0613 | 0.03620 | 0.000000 | 0.3250 | 0.395 | 97.972 | 284387.0 | 3.823034e+08 | 3293569.0 | 102947.0 | True | 6.715107e+08 |
600 rows × 20 columns
# Same selection ranked by Views instead of Stream: the 600 rows with the
# highest Views value. Relies on the "table0" view registered earlier.
top_views_query = '''
SELECT *
FROM table0
ORDER BY Views DESC
LIMIT 600'''
df600_YT = spark.sql(top_views_query).toPandas()
df600_YT
| Artist | Track | Album | Album_type | Danceability | Energy | Key | Loudness | Speechiness | Acousticness | Instrumentalness | Liveness | Valence | Tempo | Duration_ms | Views | Likes | Comments | Licensed | Stream | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Daddy Yankee | Despacito | VIDA | album | 0.655 | 0.797 | 2.0 | -4.787 | 0.1530 | 0.19800 | 0.000000 | 0.0670 | 0.839 | 177.928 | 229360.0 | 8.079647e+09 | 50788626.0 | 4252791.0 | True | 1.506598e+09 |
| 1 | Ed Sheeran | Shape of You | ÷ (Deluxe) | album | 0.825 | 0.652 | 1.0 | -3.183 | 0.0802 | 0.58100 | 0.000000 | 0.0931 | 0.931 | 95.977 | 233713.0 | 5.908398e+09 | 31047780.0 | 1130327.0 | True | 3.362005e+09 |
| 2 | Wiz Khalifa | See You Again (feat. Charlie Puth) | See You Again (feat. Charlie Puth) | single | 0.689 | 0.481 | 10.0 | -7.503 | 0.0815 | 0.36900 | 0.000001 | 0.0649 | 0.283 | 80.025 | 229526.0 | 5.773797e+09 | 40147618.0 | 2127345.0 | True | 1.521255e+09 |
| 3 | CoComelon | Wheels on the Bus | CoComelon Kids Hits, Vol. 1 | album | 0.941 | 0.387 | 9.0 | -11.920 | 0.0427 | 0.18400 | 0.000029 | 0.1570 | 0.965 | 125.021 | 207340.0 | 4.898831e+09 | 14396841.0 | 0.0 | True | 8.343436e+07 |
| 4 | Mark Ronson | Uptown Funk (feat. Bruno Mars) | Uptown Special | album | 0.856 | 0.609 | 0.0 | -7.223 | 0.0824 | 0.00801 | 0.000082 | 0.0344 | 0.928 | 114.988 | 269667.0 | 4.821016e+09 | 20067879.0 | 598916.0 | True | 1.653820e+09 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 595 | Nelly Furtado | Promiscuous - Radio Edit | Promiscuous (Radio Edit) | single | 0.786 | 0.979 | 10.0 | -4.489 | 0.0440 | 0.09160 | 0.000021 | 0.3490 | 0.811 | 114.302 | 220867.0 | 5.161938e+08 | 2735269.0 | 76338.0 | True | 1.012045e+07 |
| 596 | CoComelon | Finger Family | CoComelon Kids Hits, Vol. 2 | album | 0.816 | 0.565 | 9.0 | -11.555 | 0.0360 | 0.35200 | 0.000014 | 0.0768 | 0.965 | 119.983 | 136003.0 | 5.160842e+08 | 1714627.0 | 0.0 | True | 2.684379e+07 |
| 597 | Rick Ross | Purple Lamborghini (with Rick Ross) | Purple Lamborghini (with Rick Ross) | single | 0.457 | 0.899 | 10.0 | -2.591 | 0.1320 | 0.00190 | 0.000000 | 0.8310 | 0.298 | 147.268 | 215510.0 | 5.160529e+08 | 4441252.0 | 147715.0 | True | 3.736007e+08 |
| 598 | Prince Royce | Bubalu | Bubalu | compilation | 0.800 | 0.355 | 6.0 | -10.470 | 0.0895 | 0.61600 | 0.000000 | 0.1180 | 0.253 | 145.929 | 228493.0 | 5.145548e+08 | 3010847.0 | 75208.0 | True | 3.349868e+08 |
| 599 | Beyoncé | Love On Top | 4 | album | 0.652 | 0.749 | 0.0 | -5.248 | 0.0886 | 0.08480 | 0.000000 | 0.6040 | 0.651 | 94.103 | 267413.0 | 5.145325e+08 | 2260106.0 | 93092.0 | True | 5.248471e+08 |
600 rows × 20 columns
# Compute the percentage of rows per Album_type (rounded to 2 decimals) and
# show the split as a donut chart.
df_spark.createOrReplaceTempView("table1")
album_type_query = '''
SELECT Album_type,
ROUND((COUNT(Album_type)*1.0/(SELECT COUNT(Album_type) FROM table1)*100),2) AS perc
FROM table1
GROUP BY Album_type'''
sqlDF = spark.sql(album_type_query).toPandas()

irises_colors = ['rgb(33, 75, 99)', 'rgb(79, 129, 102)', 'rgb(151, 179, 100)']
album_type_pie = go.Pie(
    labels=sqlDF['Album_type'],
    values=sqlDF['perc'],
    hole=.5,
    pull=[0.02, 0.02, 0.02],  # nudge each of the three slices slightly outwards
    marker_colors=irises_colors,
)
fig = go.Figure(data=[album_type_pie])
fig.show()
# Average every numeric metric per artist and keep the 10 artists ranked
# highest by Views (descending).
df_spark.createOrReplaceTempView("table2")
artist_averages_query = '''SELECT Artist,
AVG(Danceability) AS Danceability, AVG(Energy) AS Energy, AVG(Key) AS Key,
AVG(Loudness) AS Loudness, AVG(Speechiness) AS Speechiness,
AVG(Acousticness) AS Acousticness, AVG(Instrumentalness) AS Instrumentalness, AVG(Liveness) AS Liveness,
AVG(Valence) AS Valence, AVG(Tempo) AS Tempo, AVG(Duration_ms) AS Duration_ms,
AVG(Views) AS Views, AVG(Likes) AS Likes, AVG(Comments) AS Comments, AVG(Stream) AS Stream
FROM table2
GROUP BY Artist
ORDER BY Views DESC
LIMIT 10;'''
sqlDF = spark.sql(artist_averages_query).toPandas()
sqlDF
23/05/02 18:28:46 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
| Artist | Danceability | Energy | Key | Loudness | Speechiness | Acousticness | Instrumentalness | Liveness | Valence | Tempo | Duration_ms | Views | Likes | Comments | Stream | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Katy Perry | 0.655000 | 0.753000 | 5.250000 | -4.822000 | 0.053737 | 0.039957 | 5.623750e-06 | 0.245125 | 0.565875 | 125.000500 | 221773.750000 | 1.552183e+09 | 7.298340e+06 | 372232.750000 | 6.807561e+08 |
| 1 | Ed Sheeran | 0.740375 | 0.657750 | 4.375000 | -4.809000 | 0.062950 | 0.289125 | 2.311250e-05 | 0.190438 | 0.661375 | 106.736375 | 216951.875000 | 1.354188e+09 | 7.233060e+06 | 233036.250000 | 1.251813e+09 |
| 2 | Dua Lipa | 0.773250 | 0.715500 | 8.250000 | -5.076000 | 0.076700 | 0.016010 | 3.900000e-06 | 0.099475 | 0.678000 | 110.007000 | 203405.250000 | 1.232530e+09 | 8.984291e+06 | 197210.250000 | 1.735381e+09 |
| 3 | CoComelon | 0.766857 | 0.401857 | 4.428571 | -11.320429 | 0.040029 | 0.494857 | 1.900429e-05 | 0.096957 | 0.858143 | 146.948000 | 144556.428571 | 1.202930e+09 | 3.713634e+06 | 0.000000 | 3.756832e+07 |
| 4 | The Weeknd | 0.584167 | 0.637500 | 1.666667 | -6.269333 | 0.085883 | 0.085293 | 1.902500e-05 | 0.224283 | 0.380667 | 142.643500 | 229499.833333 | 1.122235e+09 | 7.641168e+06 | 235070.666667 | 1.858146e+09 |
| 5 | Wiz Khalifa | 0.724667 | 0.651167 | 7.666667 | -5.909333 | 0.071083 | 0.207433 | 1.716667e-07 | 0.180300 | 0.541333 | 136.015000 | 224813.000000 | 1.087298e+09 | 7.409934e+06 | 393581.666667 | 4.948412e+08 |
| 6 | Daddy Yankee | 0.772400 | 0.821200 | 3.900000 | -4.783400 | 0.069220 | 0.133920 | 1.970000e-05 | 0.138710 | 0.682200 | 103.704700 | 206795.900000 | 1.087193e+09 | 6.551817e+06 | 467212.800000 | 5.135462e+08 |
| 7 | Enrique Iglesias | 0.671250 | 0.733125 | 5.375000 | -4.887000 | 0.060625 | 0.222550 | 5.102250e-05 | 0.114375 | 0.634500 | 113.234125 | 231301.750000 | 1.040749e+09 | 3.578378e+06 | 99921.125000 | 3.355495e+08 |
| 8 | DJ Snake | 0.629667 | 0.798500 | 6.333333 | -4.531000 | 0.103133 | 0.035372 | 1.370779e-01 | 0.192700 | 0.295167 | 124.661000 | 207896.833333 | 1.031645e+09 | 6.536074e+06 | 184021.333333 | 7.663337e+08 |
| 9 | Macklemore & Ryan Lewis | 0.612000 | 0.568000 | 4.000000 | -8.555000 | 0.074300 | 0.523000 | 0.000000e+00 | 0.217000 | 0.230000 | 116.247000 | 346164.000000 | 1.012206e+09 | 6.604141e+06 | 242519.000000 | 5.662208e+07 |
# Metrics to chart, in subplot order (filled left-to-right, top-to-bottom).
metrics = [
    'Views', 'Likes', 'Comments', 'Stream',
    'Danceability', 'Energy', 'Key', 'Loudness',
    'Speechiness', 'Acousticness', 'Instrumentalness', 'Liveness',
    'Valence', 'Tempo', 'Duration_ms',
]

# Two-column grid with just enough rows for every metric (ceiling division).
num_metrics = len(metrics)
cols = 2
rows = -(-num_metrics // cols)

fig = sp.make_subplots(
    rows=rows,
    cols=cols,
    subplot_titles=metrics,
    vertical_spacing=0.03,
    horizontal_spacing=0.035,
)

# The first column of the query result supplies the x-axis categories.
artist_axis = sqlDF[sqlDF.columns[0]]

# One bar chart per metric, each bar labelled with its value to 2 decimals.
for position, metric in enumerate(metrics):
    grid_row, grid_col = divmod(position, cols)
    fig.add_trace(
        go.Bar(
            x=artist_axis,
            y=sqlDF[metric],
            name=metric,
            text=sqlDF[metric],
            textposition='outside',
            texttemplate='%{text:.2f}',
            textfont=dict(size=8),
        ),
        row=grid_row + 1,
        col=grid_col + 1,
    )

# Size the figure proportionally to the grid and centre the title.
fig.update_layout(
    title_text='Average Metrics by Top Artists',
    title_x=0.5,
    font=dict(size=9),
    height=400 * rows,
    width=500 * cols,
    showlegend=False,
)

# Subplot titles are stored as layout annotations; enlarge their font.
for subplot_title in fig['layout']['annotations']:
    subplot_title['font'] = dict(size=14)

fig.show()
# Expose the Spark DataFrame to SQL under the name "table3".
df_spark.createOrReplaceTempView("table3")

# Per-artist averages of every numeric column, keeping the 10 artists with
# the highest average Stream.
top_artists_by_stream_sql = '''SELECT Artist,
AVG(Danceability) AS Danceability, AVG(Energy) AS Energy, AVG(Key) AS Key,
AVG(Loudness) AS Loudness, AVG(Speechiness) AS Speechiness,
AVG(Acousticness) AS Acousticness, AVG(Instrumentalness) AS Instrumentalness, AVG(Liveness) AS Liveness,
AVG(Valence) AS Valence, AVG(Tempo) AS Tempo, AVG(Duration_ms) AS Duration_ms,
AVG(Views) AS Views, AVG(Likes) AS Likes, AVG(Comments) AS Comments, AVG(Stream) AS Stream
FROM table3
GROUP BY Artist
ORDER BY Stream DESC
LIMIT 10;'''

# Collect the small result set into pandas for display and plotting.
top_artists_by_stream = spark.sql(top_artists_by_stream_sql)
sqlDF = top_artists_by_stream.toPandas()
sqlDF
| Artist | Danceability | Energy | Key | Loudness | Speechiness | Acousticness | Instrumentalness | Liveness | Valence | Tempo | Duration_ms | Views | Likes | Comments | Stream | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Drake | 0.773000 | 0.537000 | 4.000000 | -7.410000 | 0.081300 | 0.020480 | 0.000941 | 0.440500 | 0.363500 | 90.568000 | 186480.000000 | 8.185059e+08 | 8.453772e+06 | 306442.000000 | 2.312661e+09 |
| 1 | The Weeknd | 0.584167 | 0.637500 | 1.666667 | -6.269333 | 0.085883 | 0.085293 | 0.000019 | 0.224283 | 0.380667 | 142.643500 | 229499.833333 | 1.122235e+09 | 7.641168e+06 | 235070.666667 | 1.858146e+09 |
| 2 | Dua Lipa | 0.773250 | 0.715500 | 8.250000 | -5.076000 | 0.076700 | 0.016010 | 0.000004 | 0.099475 | 0.678000 | 110.007000 | 203405.250000 | 1.232530e+09 | 8.984291e+06 | 197210.250000 | 1.735381e+09 |
| 3 | Post Malone | 0.653571 | 0.622000 | 5.714286 | -5.355714 | 0.075086 | 0.192029 | 0.000359 | 0.122471 | 0.366571 | 130.840286 | 200737.428571 | 4.079678e+08 | 4.284726e+06 | 108027.571429 | 1.513507e+09 |
| 4 | XXXTENTACION | 0.780250 | 0.524375 | 5.000000 | -8.196125 | 0.151400 | 0.400712 | 0.001613 | 0.135825 | 0.430125 | 124.639125 | 138198.625000 | 3.038998e+08 | 5.270598e+06 | 373614.125000 | 1.368585e+09 |
| 5 | Justin Bieber | 0.614000 | 0.589571 | 2.428571 | -6.922857 | 0.111143 | 0.411000 | 0.000004 | 0.242914 | 0.535429 | 123.849714 | 181000.000000 | 8.082079e+08 | 5.752261e+06 | 255620.285714 | 1.313070e+09 |
| 6 | Shawn Mendes | 0.679400 | 0.622600 | 6.600000 | -7.199600 | 0.048980 | 0.221520 | 0.000029 | 0.086100 | 0.605320 | 121.408200 | 206256.000000 | 9.142391e+08 | 8.889465e+06 | 283394.600000 | 1.294543e+09 |
| 7 | Ed Sheeran | 0.740375 | 0.657750 | 4.375000 | -4.809000 | 0.062950 | 0.289125 | 0.000023 | 0.190438 | 0.661375 | 106.736375 | 216951.875000 | 1.354188e+09 | 7.233060e+06 | 233036.250000 | 1.251813e+09 |
| 8 | Khalid | 0.645778 | 0.573667 | 3.111111 | -7.611111 | 0.132578 | 0.309900 | 0.037364 | 0.121722 | 0.343667 | 111.576778 | 203433.333333 | 5.970037e+08 | 5.162533e+06 | 110469.333333 | 1.242641e+09 |
| 9 | Coldplay | 0.472700 | 0.570400 | 4.800000 | -7.357100 | 0.030770 | 0.207006 | 0.005588 | 0.177930 | 0.290600 | 126.111200 | 261958.300000 | 9.997278e+08 | 6.315790e+06 | 212437.000000 | 1.177848e+09 |
# Metrics to chart, in subplot order (filled left-to-right, top-to-bottom).
metrics = ['Stream', 'Views', 'Likes', 'Comments', 'Danceability', 'Energy', 'Key', 'Loudness', 'Speechiness', 'Acousticness', 'Instrumentalness', 'Liveness', 'Valence', 'Tempo', 'Duration_ms']

# Derive the grid from this cell's own metrics list instead of reusing
# `rows`/`cols` left over from an earlier cell, so the cell also works when
# run on its own or when the metrics list changes length.
cols = 2
rows = (len(metrics) + cols - 1) // cols  # ceiling division

fig = sp.make_subplots(rows=rows, cols=cols, subplot_titles=metrics,
                       vertical_spacing=0.03, horizontal_spacing=0.035)

# One bar chart per metric; the first column of sqlDF supplies the x-axis
# categories and each bar is labelled with its value to 2 decimals.
for i, metric in enumerate(metrics):
    row = i // cols + 1
    col = i % cols + 1
    trace = go.Bar(x=sqlDF[sqlDF.columns[0]], y=sqlDF[metric], name=metric,
                   text=sqlDF[metric], textposition='outside', texttemplate='%{text:.2f}',
                   textfont=dict(size=8))
    fig.add_trace(trace, row=row, col=col)

# Size the figure proportionally to the grid and centre the title.
fig.update_layout(title_text='Average Metrics by Top Artists', title_x=0.5, font=dict(size=9),
                  height=400 * rows, width=500 * cols, showlegend=False)

# Subplot titles are stored as layout annotations; enlarge their font.
for annotation in fig['layout']['annotations']:
    annotation['font'] = dict(size=14)

fig.show()
# Expose the Spark DataFrame to SQL under the name "table4".
df_spark.createOrReplaceTempView("table4")

# Full rows of the 10 tracks with the highest Views.
top_tracks_by_views_sql = '''SELECT *
FROM table4
ORDER BY Views DESC
LIMIT 10;'''

# Collect the small result set into pandas for display.
top_tracks_by_views = spark.sql(top_tracks_by_views_sql)
sqlDF = top_tracks_by_views.toPandas()
sqlDF
| Artist | Track | Album | Album_type | Danceability | Energy | Key | Loudness | Speechiness | Acousticness | Instrumentalness | Liveness | Valence | Tempo | Duration_ms | Views | Likes | Comments | Licensed | Stream | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Daddy Yankee | Despacito | VIDA | album | 0.655 | 0.797 | 2.0 | -4.787 | 0.1530 | 0.19800 | 0.000000 | 0.0670 | 0.839 | 177.928 | 229360.0 | 8.079647e+09 | 50788626.0 | 4252791.0 | True | 1.506598e+09 |
| 1 | Ed Sheeran | Shape of You | ÷ (Deluxe) | album | 0.825 | 0.652 | 1.0 | -3.183 | 0.0802 | 0.58100 | 0.000000 | 0.0931 | 0.931 | 95.977 | 233713.0 | 5.908398e+09 | 31047780.0 | 1130327.0 | True | 3.362005e+09 |
| 2 | Wiz Khalifa | See You Again (feat. Charlie Puth) | See You Again (feat. Charlie Puth) | single | 0.689 | 0.481 | 10.0 | -7.503 | 0.0815 | 0.36900 | 0.000001 | 0.0649 | 0.283 | 80.025 | 229526.0 | 5.773797e+09 | 40147618.0 | 2127345.0 | True | 1.521255e+09 |
| 3 | CoComelon | Wheels on the Bus | CoComelon Kids Hits, Vol. 1 | album | 0.941 | 0.387 | 9.0 | -11.920 | 0.0427 | 0.18400 | 0.000029 | 0.1570 | 0.965 | 125.021 | 207340.0 | 4.898831e+09 | 14396841.0 | 0.0 | True | 8.343436e+07 |
| 4 | Mark Ronson | Uptown Funk (feat. Bruno Mars) | Uptown Special | album | 0.856 | 0.609 | 0.0 | -7.223 | 0.0824 | 0.00801 | 0.000082 | 0.0344 | 0.928 | 114.988 | 269667.0 | 4.821016e+09 | 20067879.0 | 598916.0 | True | 1.653820e+09 |
| 5 | PSY | Gangnam Style (강남스타일) | Gangnam Style (강남스타일) | single | 0.727 | 0.937 | 11.0 | -2.871 | 0.2860 | 0.00417 | 0.000000 | 0.0910 | 0.749 | 132.067 | 219493.0 | 4.679767e+09 | 26399133.0 | 5331537.0 | False | 3.709911e+08 |
| 6 | Katy Perry | Roar | PRISM | album | 0.671 | 0.771 | 7.0 | -4.821 | 0.0316 | 0.00492 | 0.000007 | 0.3540 | 0.436 | 90.003 | 223546.0 | 3.725749e+09 | 15864499.0 | 763366.0 | True | 8.847210e+08 |
| 7 | OneRepublic | Counting Stars | Native | album | 0.664 | 0.705 | 1.0 | -4.972 | 0.0382 | 0.06540 | 0.000000 | 0.1150 | 0.477 | 122.017 | 257840.0 | 3.721610e+09 | 16558621.0 | 475191.0 | True | 1.805320e+09 |
| 8 | Justin Bieber | Sorry | Purpose (Deluxe) | album | 0.654 | 0.760 | 0.0 | -3.669 | 0.0450 | 0.07970 | 0.000000 | 0.2990 | 0.410 | 99.945 | 200787.0 | 3.627306e+09 | 15789307.0 | 865675.0 | True | 1.740759e+09 |
| 9 | Ed Sheeran | Thinking out Loud | x (Wembley Edition) | album | 0.781 | 0.445 | 2.0 | -6.061 | 0.0295 | 0.47400 | 0.000000 | 0.1840 | 0.591 | 78.998 | 281560.0 | 3.547156e+09 | 14343730.0 | 362545.0 | True | 2.154334e+09 |
# Expose the Spark DataFrame to SQL under the name "table5".
df_spark.createOrReplaceTempView("table5")

# Full rows of the 10 tracks with the highest Stream.
top_tracks_by_stream_sql = '''SELECT *
FROM table5
ORDER BY Stream DESC
LIMIT 10;'''

# Collect the small result set into pandas for display.
top_tracks_by_stream = spark.sql(top_tracks_by_stream_sql)
sqlDF = top_tracks_by_stream.toPandas()
sqlDF
| Artist | Track | Album | Album_type | Danceability | Energy | Key | Loudness | Speechiness | Acousticness | Instrumentalness | Liveness | Valence | Tempo | Duration_ms | Views | Likes | Comments | Licensed | Stream | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | The Weeknd | Blinding Lights | After Hours | album | 0.514 | 0.730 | 1.0 | -5.934 | 0.0598 | 0.00146 | 0.000095 | 0.0897 | 0.334 | 171.005 | 200040.0 | 6.741645e+08 | 8817927.0 | 282589.0 | True | 3.386520e+09 |
| 1 | Ed Sheeran | Shape of You | ÷ (Deluxe) | album | 0.825 | 0.652 | 1.0 | -3.183 | 0.0802 | 0.58100 | 0.000000 | 0.0931 | 0.931 | 95.977 | 233713.0 | 5.908398e+09 | 31047780.0 | 1130327.0 | True | 3.362005e+09 |
| 2 | Lewis Capaldi | Someone You Loved | Divinely Uninspired To A Hellish Extent | album | 0.501 | 0.405 | 1.0 | -5.679 | 0.0319 | 0.75100 | 0.000000 | 0.1050 | 0.446 | 109.891 | 182161.0 | 5.867684e+08 | 7367091.0 | 147565.0 | True | 2.634013e+09 |
| 3 | Post Malone | rockstar (feat. 21 Savage) | beerbongs & bentleys | album | 0.585 | 0.520 | 5.0 | -6.136 | 0.0712 | 0.12400 | 0.000070 | 0.1310 | 0.129 | 159.801 | 218147.0 | 1.060220e+09 | 12564657.0 | 366520.0 | True | 2.594927e+09 |
| 4 | Swae Lee | Sunflower - Spider-Man: Into the Spider-Verse | Hollywood's Bleeding | album | 0.755 | 0.522 | 2.0 | -4.368 | 0.0575 | 0.53300 | 0.000000 | 0.0685 | 0.925 | 89.960 | 157560.0 | 1.977389e+09 | 13749806.0 | 331064.0 | True | 2.538330e+09 |
| 5 | Drake | One Dance | Views | album | 0.792 | 0.625 | 1.0 | -5.609 | 0.0536 | 0.00776 | 0.001800 | 0.3290 | 0.370 | 103.967 | 173987.0 | 1.692883e+08 | 1662640.0 | 13775.0 | False | 2.522432e+09 |
| 6 | Imagine Dragons | Believer | Evolve | album | 0.776 | 0.780 | 10.0 | -4.374 | 0.1280 | 0.06220 | 0.000000 | 0.0810 | 0.666 | 124.949 | 204347.0 | 2.369715e+09 | 20483444.0 | 613230.0 | True | 2.369272e+09 |
| 7 | Justin Bieber | STAY (with Justin Bieber) | F*CK LOVE 3+: OVER YOU | album | 0.591 | 0.764 | 1.0 | -5.484 | 0.0483 | 0.03830 | 0.000000 | 0.1030 | 0.478 | 169.928 | 141806.0 | 6.812056e+08 | 10131328.0 | 247007.0 | True | 2.365778e+09 |
| 8 | Shawn Mendes | Señorita | Shawn Mendes (Deluxe) | album | 0.759 | 0.548 | 9.0 | -6.049 | 0.0290 | 0.03920 | 0.000000 | 0.0828 | 0.749 | 116.967 | 190800.0 | 1.487649e+09 | 19846114.0 | 640320.0 | True | 2.336220e+09 |
| 9 | Glass Animals | Heat Waves | Dreamland (+ Bonus Levels) | album | 0.761 | 0.525 | 11.0 | -6.900 | 0.0944 | 0.44000 | 0.000007 | 0.0921 | 0.531 | 80.870 | 238805.0 | 4.800890e+08 | 7145914.0 | 140345.0 | True | 2.261464e+09 |
# Scatter plots relating audio features to popularity.
# Columns are referenced by name: `df` is already passed as the data frame,
# so handing Plotly Express the same columns again as Series is redundant.

# Danceability vs. Views, coloured by Energy.
fig = px.scatter(df, x='Danceability', y='Views', color='Energy', hover_data=['Track'])
fig.show()

# Speechiness vs. Views, coloured by Energy.
fig = px.scatter(df, x='Speechiness', y='Views', color='Energy', hover_data=['Track'])
fig.show()

# Tempo vs. Views (no colour dimension).
fig = px.scatter(df, x='Tempo', y='Views', hover_data=['Track'])
fig.show()

# Valence vs. Stream, coloured by Energy.
fig = px.scatter(df, x='Valence', y='Stream', color='Energy', hover_data=['Track'])
fig.show()

# Danceability vs. Tempo, coloured by Stream, with an OLS trendline.
fig = px.scatter(df, x='Danceability', y='Tempo', color='Stream', trendline='ols')
fig.show()

# Loudness vs. Views, coloured by Danceability.
fig = px.scatter(df, x='Loudness', y='Views', color='Danceability', hover_data=['Track'])
fig.show()
# Likes per view for every track, rounded to 4 decimals.
# NOTE: despite the 'V/L' in the column name, the value is Likes / Views;
# the name is kept because other cells reference this column.
# Tracks with zero views get NaN instead of inf so that they cannot
# dominate the descending ranking below.
df['V/L_Ratio'] = (df['Likes'] / df['Views'].replace(0, np.nan)).round(4)

# The 20 tracks with the highest ratio (NaN rows sort last).
vl = (
    df.sort_values(by='V/L_Ratio', ascending=False)
    .head(20)[['Track', 'Artist', 'Views', 'Likes', 'V/L_Ratio']]
)

fig = px.bar(vl, x='Track', y='V/L_Ratio', hover_data=['Artist'])
fig.show()
# Expose the Spark DataFrame to SQL under the name "table6".
df_spark.createOrReplaceTempView("table6")

# Total Stream per (Album, Artist) pair, keeping the 15 largest totals.
top_albums_sql = '''SELECT Album, Artist, SUM(Stream) AS Streams
FROM table6
GROUP BY Album, Artist
ORDER BY SUM(Stream) DESC
LIMIT 15;'''

top_albums = spark.sql(top_albums_sql)
sqlDF = top_albums.toPandas()
sqlDF

# Line chart with point markers: album on the x-axis, summed streams on y;
# the artist is shown on hover.
fig = px.line(
    sqlDF,
    x=sqlDF['Album'],
    y=sqlDF['Streams'],
    hover_data=['Artist'],
    markers=True,
)
fig.show()
# Expose the Spark DataFrame to SQL under the name "table7".
df_spark.createOrReplaceTempView("table7")

# Total Likes per artist, keeping the 15 largest totals.
# The query reads the view registered just above (it previously read
# "table6", which only exists if an earlier cell has been run).
sqlDF = spark.sql('''SELECT Artist, SUM(Likes) AS Total_Likes
FROM table7
GROUP BY Artist
ORDER BY SUM(Likes) DESC
LIMIT 15;''').toPandas()
sqlDF

# Bar chart of total likes per artist.
fig = px.bar(sqlDF, x='Artist', y='Total_Likes')
fig.show()
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# Display the current DataFrame (bare expression, rendered by the notebook).
df
| Artist | Track | Album | Album_type | Danceability | Energy | Key | Loudness | Speechiness | Acousticness | ... | Liveness | Valence | Tempo | Duration_ms | Views | Likes | Comments | Licensed | Stream | V/L_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Gorillaz | Feel Good Inc. | Demon Days | album | 0.818 | 0.705 | 6.0 | -6.679 | 0.1770 | 0.008360 | ... | 0.6130 | 0.7720 | 138.559 | 222640.0 | 693555221.0 | 6220896.0 | 169907.0 | True | 1.040235e+09 | 0.0090 |
| 1 | Gorillaz | Rhinestone Eyes | Plastic Beach | album | 0.676 | 0.703 | 8.0 | -5.815 | 0.0302 | 0.086900 | ... | 0.0463 | 0.8520 | 92.761 | 200173.0 | 72011645.0 | 1079128.0 | 31003.0 | True | 3.100837e+08 | 0.0150 |
| 2 | Gorillaz | New Gold (feat. Tame Impala and Bootie Brown) | New Gold (feat. Tame Impala and Bootie Brown) | single | 0.695 | 0.923 | 1.0 | -3.930 | 0.0522 | 0.042500 | ... | 0.1160 | 0.5510 | 108.014 | 215150.0 | 8435055.0 | 282142.0 | 7399.0 | True | 6.306347e+07 | 0.0334 |
| 3 | Gorillaz | On Melancholy Hill | Plastic Beach | album | 0.689 | 0.739 | 2.0 | -5.810 | 0.0260 | 0.000015 | ... | 0.0640 | 0.5780 | 120.423 | 233867.0 | 211754952.0 | 1788577.0 | 55229.0 | True | 4.346636e+08 | 0.0084 |
| 4 | Gorillaz | Clint Eastwood | Gorillaz | album | 0.663 | 0.694 | 10.0 | -8.627 | 0.1710 | 0.025300 | ... | 0.0698 | 0.5250 | 167.953 | 340920.0 | 618480958.0 | 6197318.0 | 155930.0 | True | 6.172597e+08 | 0.0100 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20713 | SICK LEGEND | JUST DANCE HARDSTYLE | JUST DANCE HARDSTYLE | single | 0.582 | 0.926 | 5.0 | -6.344 | 0.0328 | 0.448000 | ... | 0.0839 | 0.6580 | 90.002 | 94667.0 | 71678.0 | 1113.0 | 0.0 | True | 9.227144e+06 | 0.0155 |
| 20714 | SICK LEGEND | SET FIRE TO THE RAIN HARDSTYLE | SET FIRE TO THE RAIN HARDSTYLE | single | 0.531 | 0.936 | 4.0 | -1.786 | 0.1370 | 0.028000 | ... | 0.0923 | 0.6570 | 174.869 | 150857.0 | 164741.0 | 2019.0 | 0.0 | True | 1.089818e+07 | 0.0123 |
| 20715 | SICK LEGEND | OUTSIDE HARDSTYLE SPED UP | OUTSIDE HARDSTYLE SPED UP | single | 0.443 | 0.830 | 4.0 | -4.679 | 0.0647 | 0.024300 | ... | 0.1540 | 0.4190 | 168.388 | 136842.0 | 35646.0 | 329.0 | 0.0 | True | 6.226110e+06 | 0.0092 |
| 20716 | SICK LEGEND | ONLY GIRL HARDSTYLE | ONLY GIRL HARDSTYLE | single | 0.417 | 0.767 | 9.0 | -4.004 | 0.4190 | 0.356000 | ... | 0.1080 | 0.5390 | 155.378 | 108387.0 | 6533.0 | 88.0 | 0.0 | True | 6.873961e+06 | 0.0135 |
| 20717 | SICK LEGEND | MISS YOU HARDSTYLE | MISS YOU HARDSTYLE | single | 0.498 | 0.938 | 6.0 | -4.543 | 0.1070 | 0.002770 | ... | 0.1360 | 0.0787 | 160.067 | 181500.0 | 158697.0 | 2484.0 | 0.0 | True | 5.695584e+06 | 0.0157 |
16866 rows × 21 columns
# List the column labels available at this point.
df.columns
Index(['Artist', 'Track', 'Album', 'Album_type', 'Danceability', 'Energy',
'Key', 'Loudness', 'Speechiness', 'Acousticness', 'Instrumentalness',
'Liveness', 'Valence', 'Tempo', 'Duration_ms', 'Views', 'Likes',
'Comments', 'Licensed', 'Stream', 'V/L_Ratio'],
dtype='object')
# Predict Views from the audio features with ordinary linear regression.
#
# The features are selected explicitly (rather than by dropping everything
# else) so that newly added columns cannot leak into the model and so that
# the column order matches the hand-written rows in `new_data` below.
# No feature scaling is applied.
feature_cols = [
    'Danceability', 'Energy', 'Key', 'Loudness', 'Speechiness',
    'Acousticness', 'Instrumentalness', 'Liveness', 'Valence',
    'Tempo', 'Duration_ms',
]
X = df[feature_cols].values
y = df['Views']

# Hold out 20% of the rows for evaluation; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Training the linear regression model
reg = LinearRegression()
reg.fit(X_train, y_train)

# Evaluate the model on the test set
y_pred = reg.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print('Mean squared error:', mse)
print('Mean absolute error:', mae)
print('R-squared:', r2)

# Making predictions on new data; each row lists values in `feature_cols` order
new_data = [[0.7, 0.3, 5, -4, 0.2, 0.01, 0.004, 0.09, 0.5, 120, 233347.0],
            [0.5, 0.25, 7, -5.7, 0.35, 0.06, 0.008, 0.1, 0.8, 78, 363262.0]]
new_preds = reg.predict(new_data)
print('Predictions:', new_preds)
Mean squared error: 6.269089705294543e+16 Mean absolute error: 109784142.56687756 R-squared: 0.020073031150982357 Predictions: [1.44411764e+08 1.24250442e+08]
# Fit a 100-tree random forest (fixed seed) on the training split created
# by an earlier cell.
regRF = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the forest on the held-out test split.
y_pred = regRF.predict(X_test)
mse, mae, r2 = (
    score_fn(y_test, y_pred)
    for score_fn in (mean_squared_error, mean_absolute_error, r2_score)
)
for label, score in (
    ('Mean squared error:', mse),
    ('Mean absolute error:', mae),
    ('R-squared:', r2),
):
    print(label, score)

# Predict for two hand-written feature rows (11 values each, matching the
# width of the training matrix).
new_data = [
    [0.7, 0.3, 5, -4, 0.2, 0.01, 0.004, 0.09, 0.5, 120, 233347.0],
    [0.5, 0.25, 7, -5.7, 0.35, 0.06, 0.008, 0.1, 0.8, 78, 363262.0],
]
new_preds = regRF.predict(new_data)
print('Predictions:', new_preds)
Mean squared error: 6.5106613912709176e+16 Mean absolute error: 118230420.04959916 R-squared: -0.01768725321672293 Predictions: [3.06437729e+08 3.27065231e+08]